{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用户和活动关联关系处理\n",
    "\n",
    "\n",
    "整个数据集中活动数目（events.csv）太多，所以下面的处理我们找出只在训练集和测试集中出现的活动和用户集合，并对他们重新编制索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import pickle as cPickle\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    " \"\"\"\n",
    "我们只关心train和test中出现的user和event，因此重点处理这部分关联数据\n",
    "\n",
    "train.csv 有6列：\n",
    "user：用户ID\n",
    "event：活动ID\n",
    "invited：是否被邀请（0/1）\n",
    "timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间\n",
    "interested, and not_interested\n",
    "\n",
    "Test.csv 除了没有interested, and not_interested，其余列与train相同\n",
    " \"\"\"\n",
    "    \n",
    "# 统计训练集中有多少不同的用户的events\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "#倒排表\n",
    "#统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "path = 'E:/csdn/week4/homework4/'  \n",
    "for filename in [path+\"train.csv\", path+\"test.csv\"]:\n",
    "    f = open(filename, 'r',encoding='utf-8')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    f.readline().strip().split(\",\")\n",
    "    \n",
    "    for line in f:    #对每条记录\n",
    "        cols = line.strip().split(\",\")\n",
    "        uniqueUsers.add(cols[0])   #第一列为用户ID\n",
    "        uniqueEvents.add(cols[1])   #第二列为活动ID\n",
    "        \n",
    "        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动\n",
    "        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加\n",
    "    f.close()\n",
    "\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "#用户关系矩阵表，可用于后续LFM/SVD++处理的输入\n",
    "#这是一个稀疏矩阵，记录用户对活动感兴趣\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "userIndex = dict()\n",
    "eventIndex = dict()\n",
    "\n",
    "#重新编码用户索引字典\n",
    "for i, u in enumerate(uniqueUsers):\n",
    "    userIndex[u] = i\n",
    "    \n",
    "#重新编码活动索引字典    \n",
    "for i, e in enumerate(uniqueEvents):\n",
    "    eventIndex[e] = i\n",
    "\n",
    "n_records = 0\n",
    "ftrain = open(path+\"train.csv\", 'r')\n",
    "ftrain.readline()\n",
    "for line in ftrain:\n",
    "    cols = line.strip().split(\",\")\n",
    "    i = userIndex[cols[0]]  #用户\n",
    "    j = eventIndex[cols[1]] #活动\n",
    "    \n",
    "    eventsForUser[i].add(j)    #该用户参加了这个活动\n",
    "    usersForEvent[j].add(i)    #该活动被用户参加\n",
    "        \n",
    "    #userEventScores[i, j] = int(cols[4]) - int(cols[5])   #interested - not_interested\n",
    "    score = int(cols[4])\n",
    "    #if score == 0:  #0在稀疏矩阵中表示该元素不存在，因此借用-1表示interested=0\n",
    "    #userEventScores[i, j] = -1\n",
    "    #else:\n",
    "    userEventScores[i, j] = score\n",
    "ftrain.close()\n",
    "\n",
    "  \n",
    "##统计每个用户参加的活动，后续用于将用户朋友参加的活动影响到用户\n",
    "cPickle.dump(eventsForUser, open(\"PE_eventsForUser.pkl\", 'wb'))\n",
    "##统计活动参加的用户\n",
    "cPickle.dump(usersForEvent, open(\"PE_usersForEvent.pkl\", 'wb'))\n",
    "\n",
    "#保存用户-活动关系矩阵R，以备后用\n",
    "sio.mmwrite(\"PE_userEventScores\", userEventScores)\n",
    "\n",
    "\n",
    "#保存用户索引表\n",
    "cPickle.dump(userIndex, open(\"PE_userIndex.pkl\", 'wb'))\n",
    "#保存活动索引表\n",
    "cPickle.dump(eventIndex, open(\"PE_eventIndex.pkl\", 'wb'))\n",
    "\n",
    "    \n",
    "# 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event\n",
    "# 所谓的关联用户，指的是至少在同一个event上有行为的用户pair\n",
    "# 关联的event指的是至少同一个user有行为的event pair\n",
    "uniqueUserPairs = set()\n",
    "uniqueEventPairs = set()\n",
    "for event in uniqueEvents:\n",
    "    i = eventIndex[event]\n",
    "    users = usersForEvent[i]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "        \n",
    "for user in uniqueUsers:\n",
    "    u = userIndex[user]\n",
    "    events = eventsForUser[u]\n",
    "    if len(events) > 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(events, 2))\n",
    " \n",
    "#保存用户-事件关系对索引表\n",
    "cPickle.dump(uniqueUserPairs, open(\"FE_uniqueUserPairs.pkl\", 'wb'))\n",
    "cPickle.dump(uniqueEventPairs, open(\"PE_uniqueEventPairs.pkl\", 'wb'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读取训练集和测试集中出现过的活动列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of events in train & test :13418\n"
     ]
    }
   ],
   "source": [
    "eventIndex = cPickle.load(open(\"PE_eventIndex.pkl\",\"rb\"))\n",
    "n_events = len(eventIndex)\n",
    "print(\"number of events in train & test :%d\" % n_events)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "fin = open(path+\"events.csv\",'r')\n",
    "import sys\n",
    "sys.path.append(r'E:/csdn/week4/homework4')\n",
    "from ev_utils import FeatureEng\n",
    "FE = FeatureEng()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "fin.readline()\n",
    "eventPropMatrix = ss.dok_matrix((n_events,7))\n",
    "eventContMatrix = ss.dok_matrix((n_events,101))\n",
    "for line in fin.readlines():\n",
    "    cols = line.strip().split(\",\")\n",
    "    eventId = str(cols[0])\n",
    "    if eventId in eventIndex:\n",
    "        i = eventIndex[eventId]\n",
    "        eventPropMatrix[i,0] = FE.getJoinedYearMonth(cols[2])\n",
    "        eventPropMatrix[i,1] = FE.getJoinedYearMonth(cols[3])\n",
    "        eventPropMatrix[i,2] = FE.getJoinedYearMonth(cols[4])\n",
    "        eventPropMatrix[i,3] = FE.getJoinedYearMonth(cols[5])\n",
    "        eventPropMatrix[i,4] = FE.getJoinedYearMonth(cols[6])\n",
    "        eventPropMatrix[i,5] = FE.getJoinedYearMonth(cols[7])\n",
    "        eventPropMatrix[i,6] = FE.getJoinedYearMonth(cols[8])\n",
    "        for j in range(9,109):\n",
    "            eventContMatrix[i,j-9] = cols[j]\n",
    "fin.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "eventPropMatrix = normalize(eventPropMatrix,norm = \"l2\",copy = False)\n",
    "sio.mmwrite(\"EV_eventPropMatrix\",eventPropMatrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "eventContMatrix = normalize(eventContMatrix,norm = \"l2\",copy = False)\n",
    "sio.mmwrite(\"EV_eventContMatrix\",eventContMatrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "eventPropSim = ss.dok_matrix((n_events,n_events))\n",
    "eventContSim = ss.dok_matrix((n_events,n_events))\n",
    "uniqueEventPairs = cPickle.load(open(\"PE_uniqueEventPairs.pkl\", 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:698: RuntimeWarning: invalid value encountered in double_scalars\n",
      "  dist = 1.0 - uv / np.sqrt(uu * vv)\n"
     ]
    }
   ],
   "source": [
    "for e1,e2 in uniqueEventPairs:\n",
    "    i = e1\n",
    "    j = e2\n",
    "    \n",
    "    if (i,j) not in eventPropSim:\n",
    "        epsim = ssd.correlation(eventPropMatrix.getrow(i).todense(),\n",
    "             eventPropMatrix.getrow(j).todense())\n",
    "        eventPropSim[i, j] = epsim\n",
    "        eventPropSim[j, i] = epsim\n",
    "        \n",
    "    if (i,j) not in eventContSim:\n",
    "        ecsim = ssd.cosine(eventContMatrix.getrow(i).todense(),\n",
    "              eventContMatrix.getrow(j).todense())\n",
    "\n",
    "        eventContSim[i, j] = epsim\n",
    "        eventContSim[j, i] = epsim\n",
    "\n",
    "    sio.mmwrite(\"EV_eventPropSim\", eventPropSim)\n",
    "    sio.mmwrite(\"EV_eventContSim\", eventContSim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import metrics\n",
    "from sklearn.cluster import MiniBatchKMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K, df):\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    km = MiniBatchKMeans(n_clusters = K)\n",
    "    km.fit(df)\n",
    "    \n",
    "    cluster_result = km.predict(df)\n",
    "\n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(df,cluster_result)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.17245304529766448\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.1588069607824103\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.16286852342848782\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.122746835718685\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.14334810690163033\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.1342634695950887\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.10615355489580804\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.1422592968009799\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.12898888084076382\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.13536838448055888\n"
     ]
    }
   ],
   "source": [
    "Ks = [10, 20, 30,40,50,60,70,80,90,100]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, eventContMatrix)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
